Assignment 03

Author

Soham S Deshkhaire

Published

October 29, 2025

import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
print("---This is Diagnostic check, No need to print it in the final doc---")

df.printSchema() # comment this line when rendering the submission
df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/30 01:22:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/30 01:23:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
---This is Diagnostic check, No need to print it in the final doc---
root
 |-- ID: string (nullable = true)
 |-- LAST_UPDATED_DATE: string (nullable = true)
 |-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
 |-- DUPLICATES: integer (nullable = true)
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: integer (nullable = true)
 |-- SOURCE_TYPES: string (nullable = true)
 |-- SOURCES: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- ACTIVE_URLS: string (nullable = true)
 |-- ACTIVE_SOURCES_INFO: string (nullable = true)
 |-- TITLE_RAW: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- MODELED_EXPIRED: string (nullable = true)
 |-- MODELED_DURATION: integer (nullable = true)
 |-- COMPANY: integer (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: boolean (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nullable = true)
 |-- MIN_EDULEVELS: integer (nullable = true)
 |-- MIN_EDULEVELS_NAME: string (nullable = true)
 |-- MAX_EDULEVELS: integer (nullable = true)
 |-- MAX_EDULEVELS_NAME: string (nullable = true)
 |-- EMPLOYMENT_TYPE: integer (nullable = true)
 |-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
 |-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
 |-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
 |-- IS_INTERNSHIP: boolean (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- REMOTE_TYPE: integer (nullable = true)
 |-- REMOTE_TYPE_NAME: string (nullable = true)
 |-- ORIGINAL_PAY_PERIOD: string (nullable = true)
 |-- SALARY_TO: integer (nullable = true)
 |-- SALARY_FROM: integer (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITY_NAME: string (nullable = true)
 |-- COUNTY: integer (nullable = true)
 |-- COUNTY_NAME: string (nullable = true)
 |-- MSA: integer (nullable = true)
 |-- MSA_NAME: string (nullable = true)
 |-- STATE: integer (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- COUNTY_OUTGOING: integer (nullable = true)
 |-- COUNTY_NAME_OUTGOING: string (nullable = true)
 |-- COUNTY_INCOMING: integer (nullable = true)
 |-- COUNTY_NAME_INCOMING: string (nullable = true)
 |-- MSA_OUTGOING: integer (nullable = true)
 |-- MSA_NAME_OUTGOING: string (nullable = true)
 |-- MSA_INCOMING: integer (nullable = true)
 |-- MSA_NAME_INCOMING: string (nullable = true)
 |-- NAICS2: integer (nullable = true)
 |-- NAICS2_NAME: string (nullable = true)
 |-- NAICS3: integer (nullable = true)
 |-- NAICS3_NAME: string (nullable = true)
 |-- NAICS4: integer (nullable = true)
 |-- NAICS4_NAME: string (nullable = true)
 |-- NAICS5: integer (nullable = true)
 |-- NAICS5_NAME: string (nullable = true)
 |-- NAICS6: integer (nullable = true)
 |-- NAICS6_NAME: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- TITLE_NAME: string (nullable = true)
 |-- TITLE_CLEAN: string (nullable = true)
 |-- SKILLS: string (nullable = true)
 |-- SKILLS_NAME: string (nullable = true)
 |-- SPECIALIZED_SKILLS: string (nullable = true)
 |-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
 |-- CERTIFICATIONS: string (nullable = true)
 |-- CERTIFICATIONS_NAME: string (nullable = true)
 |-- COMMON_SKILLS: string (nullable = true)
 |-- COMMON_SKILLS_NAME: string (nullable = true)
 |-- SOFTWARE_SKILLS: string (nullable = true)
 |-- SOFTWARE_SKILLS_NAME: string (nullable = true)
 |-- ONET: string (nullable = true)
 |-- ONET_NAME: string (nullable = true)
 |-- ONET_2019: string (nullable = true)
 |-- ONET_2019_NAME: string (nullable = true)
 |-- CIP6: string (nullable = true)
 |-- CIP6_NAME: string (nullable = true)
 |-- CIP4: string (nullable = true)
 |-- CIP4_NAME: string (nullable = true)
 |-- CIP2: string (nullable = true)
 |-- CIP2_NAME: string (nullable = true)
 |-- SOC_2021_2: string (nullable = true)
 |-- SOC_2021_2_NAME: string (nullable = true)
 |-- SOC_2021_3: string (nullable = true)
 |-- SOC_2021_3_NAME: string (nullable = true)
 |-- SOC_2021_4: string (nullable = true)
 |-- SOC_2021_4_NAME: string (nullable = true)
 |-- SOC_2021_5: string (nullable = true)
 |-- SOC_2021_5_NAME: string (nullable = true)
 |-- LOT_CAREER_AREA: integer (nullable = true)
 |-- LOT_CAREER_AREA_NAME: string (nullable = true)
 |-- LOT_OCCUPATION: integer (nullable = true)
 |-- LOT_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_CAREER_AREA: integer (nullable = true)
 |-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
 |-- SOC_2: string (nullable = true)
 |-- SOC_2_NAME: string (nullable = true)
 |-- SOC_3: string (nullable = true)
 |-- SOC_3_NAME: string (nullable = true)
 |-- SOC_4: string (nullable = true)
 |-- SOC_4_NAME: string (nullable = true)
 |-- SOC_5: string (nullable = true)
 |-- SOC_5_NAME: string (nullable = true)
 |-- LIGHTCAST_SECTORS: string (nullable = true)
 |-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
 |-- NAICS_2022_2: integer (nullable = true)
 |-- NAICS_2022_2_NAME: string (nullable = true)
 |-- NAICS_2022_3: integer (nullable = true)
 |-- NAICS_2022_3_NAME: string (nullable = true)
 |-- NAICS_2022_4: integer (nullable = true)
 |-- NAICS_2022_4_NAME: string (nullable = true)
 |-- NAICS_2022_5: integer (nullable = true)
 |-- NAICS_2022_5_NAME: string (nullable = true)
 |-- NAICS_2022_6: integer (nullable = true)
 |-- NAICS_2022_6_NAME: string (nullable = true)
                                                                                
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|                  ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES|  POSTED|  EXPIRED|DURATION|        SOURCE_TYPES|             SOURCES|                 URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO|           TITLE_RAW|                BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY|        COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM|            LOCATION|                CITY|    CITY_NAME|COUNTY|   COUNTY_NAME|  MSA|            MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING|   MSA_NAME_OUTGOING|MSA_INCOMING|   MSA_NAME_INCOMING|NAICS2|         NAICS2_NAME|NAICS3|         NAICS3_NAME|NAICS4|         NAICS4_NAME|NAICS5|         NAICS5_NAME|NAICS6|         NAICS6_NAME|             TITLE|         TITLE_NAME|         TITLE_CLEAN|              SKILLS|         SKILLS_NAME|  SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME|      CERTIFICATIONS| CERTIFICATIONS_NAME|       COMMON_SKILLS|  COMMON_SKILLS_NAME|     SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME|      ONET|           ONET_NAME| ONET_2019|      ONET_2019_NAME|                CIP6|           CIP6_NAME|                CIP4|           CIP4_NAME|                CIP2|           CIP2_NAME|SOC_2021_2|     SOC_2021_2_NAME|SOC_2021_3|     SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME|  SOC_2|          SOC_2_NAME|  SOC_3|          SOC_3_NAME|  SOC_4|     SOC_4_NAME|  SOC_5|     SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2|   NAICS_2022_2_NAME|NAICS_2022_3|   NAICS_2022_3_NAME|NAICS_2022_4|   NAICS_2022_4_NAME|NAICS_2022_5|   NAICS_2022_5_NAME|NAICS_2022_6|   NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...|         9/6/2024|  2024-09-06 20:32:...|         0|6/2/2024| 6/8/2024|       6|   [\n  "Company"\n]|[\n  "brassring.c...|[\n  "https://sjo...|         []|               NULL|Enterprise Analys...|31-May-2024\n\nEn...|       6/8/2024|               6|  894731|          Murphy USA| Murphy USA|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   2|                   2|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR|  5139|     Union, AR|20980|       El Dorado, AR|    5|  Arkansas|           5139|           Union, AR|           5139|           Union, AR|       20980|       El Dorado, AR|       20980|       El Dorado, AR|    44|        Retail Trade|   441|Motor Vehicle and...|  4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n  "KS126DB6T06...|[\n  "Merchandisi...|[\n  "KS126DB6T06...|   [\n  "Merchandisi...|                  []|                  []|[\n  "KS126706DPF...|[\n  "Mathematics...|[\n  "KS440W865GC...|[\n  "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n  "45.0601",\n...|[\n  "Economics, ...|[\n  "45.06",\n  ...|[\n  "Economics",...|[\n  "45",\n  "27...|[\n  "Social Scie...|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101011|           General ERP Analy...|                2310|     Business Intellig...|                     23101011|              General ERP Analy...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  7\n]|  [\n  "Artificial ...|          44|        Retail Trade|         441|Motor Vehicle and...|        4413|Automotive Parts,...|       44133|Automotive Parts ...|      441330|Automotive Parts ...|
|0cb072af26757b6c4...|         8/2/2024|  2024-08-02 17:08:...|         0|6/2/2024| 8/1/2024|    NULL| [\n  "Job Board"\n]| [\n  "maine.gov"\n]|[\n  "https://job...|         []|               NULL|Oracle Consultant...|Oracle Consultant...|       8/1/2024|            NULL|  133098|Smx Corporation L...|        SMX|               true|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                   3|        false|  NULL|          1|          Remote|               NULL|     NULL|       NULL|{\n  "lat": 44.31...|    QXVndXN0YSwgTUU=|  Augusta, ME| 23011|  Kennebec, ME|12300|Augusta-Watervill...|   23|     Maine|          23011|        Kennebec, ME|          23011|        Kennebec, ME|       12300|Augusta-Watervill...|       12300|Augusta-Watervill...|    56|Administrative an...|   561|Administrative an...|  5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n  "KS122626T55...|[\n  "Procurement...|[\n  "KS122626T55...|   [\n  "Procurement...|                  []|                  []|                  []|                  []|[\n  "BGSBF3F508F...|[\n  "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          56|Administrative an...|         561|Administrative an...|        5613| Employment Services|       56132|Temporary Help Se...|      561320|Temporary Help Se...|
|85318b12b3331fa49...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024| 7/7/2024|      35| [\n  "Job Board"\n]|[\n  "dejobs.org"\n]|[\n  "https://dej...|         []|               NULL|        Data Analyst|Taking care of pe...|      6/10/2024|               8|39063746|            Sedgwick|   Sedgwick|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   5|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 32.77...|    RGFsbGFzLCBUWA==|   Dallas, TX| 48113|    Dallas, TX|19100|Dallas-Fort Worth...|   48|     Texas|          48113|          Dallas, TX|          48113|          Dallas, TX|       19100|Dallas-Fort Worth...|       19100|Dallas-Fort Worth...|    52|Finance and Insur...|   524|Insurance Carrier...|  5242|Agencies, Brokera...| 52429|Other Insurance R...|524291|    Claims Adjusting|ET3037E0C947A02404|      Data Analysts|        data analyst|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "ESF3939CE1F...|   [\n  "Exception R...|[\n  "KS683TN76T7...|[\n  "Security Cl...|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "KS126HY6YLT...|[\n  "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          52|Finance and Insur...|         524|Insurance Carrier...|        5242|Agencies, Brokera...|       52429|Other Insurance R...|      524291|    Claims Adjusting|
|1b5c3941e54a1889e...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024|7/20/2024|      48| [\n  "Job Board"\n]|[\n  "disabledper...|[\n  "https://www...|         []|               NULL|Sr. Lead Data Mgm...|About this role:\...|      6/12/2024|              10|37615159|         Wells Fargo|Wells Fargo|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.44...|    UGhvZW5peCwgQVo=|  Phoenix, AZ|  4013|  Maricopa, AZ|38060|Phoenix-Mesa-Chan...|    4|   Arizona|           4013|        Maricopa, AZ|           4013|        Maricopa, AZ|       38060|Phoenix-Mesa-Chan...|       38060|Phoenix-Mesa-Chan...|    52|Finance and Insur...|   522|Credit Intermedia...|  5221|Depository Credit...| 52211|  Commercial Banking|522110|  Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n  "KS123QX62QY...|[\n  "Exit Strate...|[\n  "KS123QX62QY...|   [\n  "Exit Strate...|                  []|                  []|[\n  "KS7G6NP6R6L...|[\n  "Reliability...|[\n  "KS4409D76NW...|[\n  "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  6\n]|  [\n  "Data Privac...|          52|Finance and Insur...|         522|Credit Intermedia...|        5221|Depository Credit...|       52211|  Commercial Banking|      522110|  Commercial Banking|
|cb5ca25f02bdf25c1...|        6/19/2024|   2024-06-19 07:00:00|         0|6/2/2024|6/17/2024|      15|[\n  "FreeJobBoar...|[\n  "craigslist....|[\n  "https://mod...|         []|               NULL|Comisiones de $10...|Comisiones de $10...|      6/17/2024|              15|       0|        Unclassified|      LH/GM|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              3|Part-time / full-...|                NULL|                NULL|        false| 92500|          0|          [None]|               year|   150000|      35000|{\n  "lat": 37.63...|    TW9kZXN0bywgQ0E=|  Modesto, CA|  6099|Stanislaus, CA|33700|         Modesto, CA|    6|California|           6099|      Stanislaus, CA|           6099|      Stanislaus, CA|       33700|         Modesto, CA|       33700|         Modesto, CA|    99|Unclassified Indu...|   999|Unclassified Indu...|  9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000|       Unclassified|comisiones de por...|                  []|                  []|                  []|                     []|                  []|                  []|                  []|                  []|                  []|                  []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          99|Unclassified Indu...|         999|Unclassified Indu...|        9999|Unclassified Indu...|       99999|Unclassified Indu...|      999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
companies_df = df.select(
    col("company"),
    col("company_name"),
    col("company_raw"),
    col("company_is_staffing")
).distinct().withColumn("company_id", monotonically_increasing_id())

companies = companies_df.toPandas()
companies.drop(columns=["company"], inplace=True)
companies.to_csv("./output/companies.csv", index=False)
companies.head()  
                                                                                
company_name company_raw company_is_staffing company_id
0 Crowe Crowe False 0
1 The Devereux Foundation The Devereux Foundation False 1
2 Elder Research Elder Research False 2
3 NTT DATA NTT DATA Inc False 3
4 Frederick National Laboratory For Cancer Research Frederick National Laboratory for Cancer Research False 4
df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float")) \
       .withColumn("SALARY", col("SALARY").cast("float")) \
       .withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float")) \
       .withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float")) \
       .withColumn("SALARY_TO", col("SALARY_TO").cast("float")) \

#Step 2: Computing medians for salary columns
def compute_median(sdf, col_name):
  q = sdf.approxQuantile(col_name, [0.5], 0.01)
  return q[0] if q else None

median_from = compute_median(df, "SALARY_FROM")
median_to = compute_median(df, "SALARY_TO")
median_salary =  compute_median(df, "SALARY")

print("Medians :", median_from, median_to, median_salary)

# df = df.withColumn("SALARY", when(
#     col("SALARY").isNotNull() & (col("SALARY") > 0),
#     col("SALARY")
# ).otherwise((col("SALARY_FROM") + col("SALARY_TO")) / 2))

# df = df.withColumn("SALARY", (col("SALARY_FROM") + col("SALARY_TO")) / 2)

df = df.fillna({
    "SALARY_FROM": median_from,
    "SALARY_TO": median_to,
})

df = df.withColumn("Average_Salary", (col("SALARY_FROM") + col("SALARY_TO")) / 2)

export_cols = [
  "EDUCATION_LEVELS_NAME",
  "REMOTE_TYPE_NAME",
  "MAX_YEARS_EXPERIENCE",
  "Average_Salary",
  "SALARY",
  "LOT_V6_SPECIALIZED_OCCUPATION_NAME"
]
df_selected = df.select(*export_cols)

pdf = df_selected.toPandas()
pdf.to_csv("./data/lightcast_cleaned.csv", index=False)

print("Data cleaning complete. Rows retained:", len(pdf))
                                                                                
Medians : 87295.0 130042.0 115024.0
                                                                                
Data cleaning complete. Rows retained: 72498
import pandas as pd
#filter out missing or zero salary values
from pyspark.sql.functions import col
pdf = df.filter(col("SALARY") > 0).select("EMPLOYMENT_TYPE_NAME", "SALARY").toPandas()
pdf.head()
                                                                                
EMPLOYMENT_TYPE_NAME SALARY
0 Part-time / full-time 92500.0
1 Full-time (> 32 hours) 110155.0
2 Full-time (> 32 hours) 92962.0
3 Full-time (> 32 hours) 107645.0
4 Full-time (> 32 hours) 192800.0
import re

pdf["EMPLOYMENT_TYPE_NAME"] = pdf["EMPLOYMENT_TYPE_NAME"].apply(
    lambda x: re.sub(r"[^\x00-\x7F]+", "", x) if isinstance(x, str) else x
)
pdf.head()
EMPLOYMENT_TYPE_NAME SALARY
0 Part-time / full-time 92500.0
1 Full-time (> 32 hours) 110155.0
2 Full-time (> 32 hours) 92962.0
3 Full-time (> 32 hours) 107645.0
4 Full-time (> 32 hours) 192800.0
# computer median salary for sorting
median_salary = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()
median_salary.head()
EMPLOYMENT_TYPE_NAME
Full-time (> 32 hours)    116500.0
Part-time ( 32 hours)      86390.0
Part-time / full-time     100000.0
Name: SALARY, dtype: float32
# Computer median salary for sorting
median_salaries = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()
median_salaries.head()
EMPLOYMENT_TYPE_NAME
Full-time (> 32 hours)    116500.0
Part-time ( 32 hours)      86390.0
Part-time / full-time     100000.0
Name: SALARY, dtype: float32
sorted_employment_types = median_salaries.sort_values(ascending=False).index
pdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical(
  pdf["EMPLOYMENT_TYPE_NAME"],
  categories=sorted_employment_types,
  ordered=True
)

# Create Box Plot with horizontal grid lines
fig = px.box(
    pdf,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY",
    title="Salary Distribution by Employment Type",
    color_discrete_sequence=["black"],
    boxmode="group",
    points="all",
)

#improve layout, font styles, and axis labels

fig.update_layout(
    title=dict(
        text="Salary Distribution by Employment Type",
        font=dict(size=30, family="Arial", color="black", weight="bold")
    ),
    xaxis=dict(
        title=dict(text="Employment Type", font=dict(size=24, family="Arial", color="black", weight="bold")),
        tickangle=0,
        tickfont=dict(size=18, family="Arial", color="black", weight="bold"),
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=False,
        categoryorder="array",
        categoryarray=sorted_employment_types.tolist()
    ),
    yaxis=dict(
        title=dict(text="Salary (K $)", font=dict(size=24, family="Arial", color="black", weight="bold")),
        tickvals=[0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000],
        ticktext=["0", "50K", "100K", "150K", "200K", "250K", "300K", "350K", "400K", "450K", "500K"],
        tickfont=dict(size=18, family="Arial", color="black", weight="bold"),
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True,
        gridcolor="lightgrey",
        gridwidth=0.5
    ),
    font=dict(family="Arial", size=16, color="black"),
    boxgap=0.7,
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False,
    height=500,
    width=850
)
fig.show()
fig.write_html("./output/boxplot_employment_type_salary.html")
import plotly.express as px

pdf = df.select("NAICS2_NAME", "SALARY").toPandas()

fig = px.box(
    pdf,
    x="NAICS2_NAME",
    y="SALARY",
    title="Salary Distribution by Industry",
    color_discrete_sequence=["#00ff00"]
)

fig.update_layout(
    font_family="Calibri",
    title_font_size=16,
    width=1300,
    height=900,
    yaxis=dict(
        tickvals=[0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000],
        ticktext=["0", "50K", "100K", "150K", "200K", "250K", "300K", "350K", "400K", "450K", "500K"],
        tickfont=dict(size=12, family="Calibri", color="black")
    )
)

fig.update_xaxes(
    title="Industry",
    tickangle=45,
    tickfont=dict(size=12, family="Calibri", color="black")
)

fig.show()
                                                                                
from pyspark.sql.functions import col, expr, count
import plotly.express as px

valid_df = df.filter(
    col("SALARY").isNotNull() & (col("SALARY") > 0) &
    col("LOT_V6_SPECIALIZED_OCCUPATION_NAME").isNotNull()
)

valid_df = df.filter(
    col("SALARY").isNotNull() & (col("SALARY") > 0) &
    col("LOT_V6_SPECIALIZED_OCCUPATION_NAME").isNotNull() &
    (col("LOT_V6_SPECIALIZED_OCCUPATION_NAME") != "")
)

agg_df = df.filter(
    col("SALARY").isNotNull() & (col("SALARY") > 0) &
    col("LOT_V6_SPECIALIZED_OCCUPATION_NAME").isNotNull() &
    (col("LOT_V6_SPECIALIZED_OCCUPATION_NAME") != "")
).groupBy("LOT_V6_SPECIALIZED_OCCUPATION_NAME") \
 .agg(
    expr("percentile_approx(SALARY, 0.5, 100)").alias("Median_Salary"),
    count("*").alias("Job_Postings")
).withColumnRenamed("LOT_V6_SPECIALIZED_OCCUPATION_NAME", "Occupation_Name")

# Converting to pandas
bubble_pdf = agg_df.toPandas()

# Buble Chart
fig = px.scatter(
    bubble_pdf,
    x="Occupation_Name",
    y="Median_Salary",
    size="Job_Postings",
    color="Median_Salary",
    color_continuous_scale="sunset",
    title="Salary Analysis by ONET Occupation Type",
    hover_name="Occupation_Name",
    size_max=60,
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Occupation Type",
    yaxis_title="Median Salary",
    width=1400,
    height=800,
    font=dict(family="Calibri", size=14),
    title_font=dict(size=28),
    margin=dict(l=40, r=40, t=80, b=120)
)

fig.show()
                                                                                
from pyspark.sql.functions import col, when

lower_deg = ["Bachelor's", "Associate", "GED", "No Education Listed", "High school"]
higher_deg = ["Master's degree", "Ph.D. or professional degree"]

# Create education group column
df = df.withColumn(
    "EDU_GROUP",
    when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in lower_deg])), "Bachelor's or lower")
    .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in higher_deg])), "Master's or PhD")
    .otherwise("Other")
)

# Cast columns properly
df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))
df = df.withColumn("Salary", col("Salary").cast("float"))

# Filter valid rows
df = df.filter(
    col("MAX_YEARS_EXPERIENCE").isNotNull() &
    col("Salary").isNotNull() &
    (col("MAX_YEARS_EXPERIENCE") > 0) &
    (col("Salary") > 0)
)

# Filter by education group
df_filtered = df.filter(col("EDU_GROUP").isin(["Bachelor's or lower", "Master's or PhD"]))
# Convert to pandas
df_pd = df_filtered.toPandas()
                                                                                
import numpy as np

# Add jitter to experience values
df_pd["EXPERIENCE_JITTER"] = df_pd["MAX_YEARS_EXPERIENCE"] + np.random.uniform(-0.3, 0.3, size=len(df_pd))
fig1 = px.scatter(
    df_pd,
    x="EXPERIENCE_JITTER",
    y="Salary",
    color="EDU_GROUP",
    hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
    title="Experience vs Salary by Education Level",
    opacity=1,
    color_discrete_sequence=["#d62728", "#ff7f0e"],
    width=1300,
    height=600
)

fig1.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig1.update_layout(
    plot_bgcolor="#f9f9f9",
    paper_bgcolor="#fff5dc",
    font=dict(family="Segoe UI", size=14),
    title_font=dict(size=22),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)",
    legend_title="Education Group",
    hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
    margin=dict(t=70, b=60, l=60, r=60),
    xaxis=dict(gridcolor="gray", tickmode="linear", dtick=1),
    yaxis=dict(gridcolor="gray")
)
# Needed to use iframe due to it being unable to render in preview with just show and not specifiying renderer
fig1.show()
fig1.write_html("output/q_1a_experience_vs_salary_by_education_level.html")
from pyspark.sql.functions import col, when

lowest_deg = ["Associate", "GED", "No Education Listed", "High school"]
bach_deg = ["Bachelor's"]
mast_deg = ["Master's degree"]
phd_deg = ["Ph.D. or professional degree"]

# Create education group column
df = df.withColumn(
    "EDU_GROUP",
    when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in lowest_deg])), "Associate's or lower")
    .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in bach_deg])), "Bachelor's")
    .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in mast_deg])), "Master's")
    .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in phd_deg])), "PhD")
    .otherwise("Other")
)

# Cast columns properly
df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))
df = df.withColumn("Salary", col("Salary").cast("float"))

# Filter valid rows
df = df.filter(
    col("MAX_YEARS_EXPERIENCE").isNotNull() &
    col("Salary").isNotNull() &
    (col("MAX_YEARS_EXPERIENCE") > 0) &
    (col("Salary") > 0)
)

# Filter by education group
df_filtered2 = df.filter(col("EDU_GROUP").isin(["Associate's or lower", "Bachelor's", "Master's", "PhD"]))

# Convert to pandas
df_pd2 = df_filtered2.toPandas()
                                                                                
import numpy as np

# Add jitter to experience values
df_pd2["EXPERIENCE_JITTER"] = df_pd2["MAX_YEARS_EXPERIENCE"] + np.random.uniform(-0.3, 0.3, size=len(df_pd2))
fig2 = px.scatter(
    df_pd2,
    x="EXPERIENCE_JITTER",  # use jittered x-axis
    y="Salary",
    color="EDU_GROUP",
    hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
    title="Experience vs Salary by Education Level",
    opacity=1,
    color_discrete_sequence=["#ff7f0e", "#17becf", "#d62728", "#e377c2", "#1f77b4"],
    width=1300,
    height=600
)

fig2.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig2.update_layout(
    plot_bgcolor="#f9f9f9",
    paper_bgcolor="#fff5dc",
    font=dict(family="Segoe UI", size=14),
    title_font=dict(size=22),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)",
    legend_title="Education Group",
    hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
    margin=dict(t=70, b=60, l=60, r=60),
    xaxis=dict(gridcolor="gray", tickmode="linear", dtick=1),
    yaxis=dict(gridcolor="gray")
)
# Needed to use iframe due to it being unable to render in preview with just show and not specifiying renderer
fig2.show()
fig2.write_html("output/q_1a_experience_vs_salary_by_education_level_4.html")
from pyspark.sql.functions import col, when, trim
#Step 1
df = df.withColumn("REMOTE_GROUP",
    when(trim(col("REMOTE_TYPE_NAME")) == "Remote", "Remote")
    .when(trim(col("REMOTE_TYPE_NAME")) == "Hybrid Remote", "Hybrid")
    .when(trim(col("REMOTE_TYPE_NAME")) == "Not Remote", "Onsite")
    .when(col("REMOTE_TYPE_NAME").isNull(), "Onsite")
    .otherwise("Onsite")
)

#Step 2: Filter rows with valid values
df = df.filter(
    col("MAX_YEARS_EXPERIENCE").isNotNull() &
    col("Average_Salary").isNotNull() &
    (col("MAX_YEARS_EXPERIENCE") > 0) &
    (col("Average_Salary") > 0) 
)

# Step 3: Convert to pandas
df_pd = df.select(
    "MAX_YEARS_EXPERIENCE", "Average_Salary",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "REMOTE_GROUP"
).toPandas()

df_pd.head()
                                                                                
MAX_YEARS_EXPERIENCE Average_Salary LOT_V6_SPECIALIZED_OCCUPATION_NAME REMOTE_GROUP
0 2.0 92962.0 Data Analyst Onsite
1 2.0 75026.0 Oracle Consultant / Analyst Onsite
2 1.0 60923.0 Data Analyst Remote
3 2.0 131100.0 Enterprise Architect Onsite
4 3.0 136950.0 Enterprise Architect Remote
df_pd["MAX_YEARS_EXPERIENCE_JITTER"] = df_pd["MAX_YEARS_EXPERIENCE"] + np.random.uniform(-0.25,0.25,size=len(df_pd))
df_pd["AVERAGE_SALARY_JITTER"] = df_pd["Average_Salary"] + np.random.uniform(-2500,2500,size=len(df_pd))
df_pd = df_pd.round(2)
df_pd.head()

# removing outliers
df_pd = df_pd[df_pd["AVERAGE_SALARY_JITTER"] < 399000]
fig4 = px.scatter(
    df_pd,
    x="MAX_YEARS_EXPERIENCE_JITTER",  # use jittered x-axis
    y="AVERAGE_SALARY_JITTER",
    color="REMOTE_GROUP",
    hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
    title="<b>Experience vs Average Salary by Remote Work Type<b>",
    opacity=1,
    color_discrete_sequence=["#1f77b4", "#ff7f0e", "#2ca02c"],
    width=1300,
    height=600
)

fig4.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig4.update_layout(
    plot_bgcolor="#f9f9f9",
    paper_bgcolor="#fff5dc",
    font=dict(family="Segoe UI", size=14),
    title_font=dict(size=22),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)",
    legend_title="Remote Work Type",
    hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
    margin=dict(t=70, b=60, l=60, r=60),
    xaxis=dict(gridcolor="gray", tickmode="linear", dtick=1),
    yaxis=dict(gridcolor="gray"),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

fig4.show()
import plotly.express as px

color_map = {
    "Remote": "#1f77b4",  # Blue
    "Hybrid": "#ff7f0e",  # Orange
    "Onsite": "#2ca02c"   # Green  
}

for group in ["Remote", "Hybrid", "Onsite"]:
    subset = df_pd[df_pd["REMOTE_GROUP"] == group]
    
    fig = px.histogram(
        subset,
        x="AVERAGE_SALARY_JITTER",
        nbins=30,
        title=f"Salary Distribution for {group} Jobs",
        color_discrete_sequence=[color_map[group]],
        width=800,
        height=500
    )

    fig.update_layout(
        plot_bgcolor="#f9f9f9",
        paper_bgcolor="#fff5dc",
        font=dict(family="Segoe UI", size=14),
        title_font=dict(size=22),
        xaxis_title="Average Salary (USD)",
        yaxis_title="Count",
        hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
        margin=dict(t=70, b=60, l=60, r=60),
        xaxis=dict(gridcolor="gray"),
        yaxis=dict(gridcolor="gray")
    )
    fig.show()
    fig.write_html(f"output/salary_distribution_{group.lower()}.html")